org_df <- read_excel("wuhan_blood_sample_data_Jan_Feb_2020.xlsx")

Dataset cleaning

The following steps were undertaken to clean the original dataset:
  • replace the gender values, from numeric (1, 2) to factor (male, female),
  • replace the outcome values, from numeric (0,1) to factor (survived, Died),
  • unification of column name from ‘Admission time’ to admission_time,
  • unification of column name from ‘Discharge time’ to discharge_time,
  • replace NA values in PATIENT_ID with suitable id (done in next chapter).
df <- org_df %>% 
        mutate(gender = as.factor(ifelse(gender==1, "male", "female"))) %>%
        mutate(outcome = as.factor(ifelse(outcome == 0, "Survived", "Died"))) %>%
        rename(admission_time = 'Admission time',
               discharge_time = 'Discharge time',
               hs_CRP = 'High sensitivity C-reactive protein')

names(df)[34] <- "Tumor necrosis factor alpha"
names(df)[37] <- "Interleukin 1 beta"
names(df)[68] <- "Gamma glutamyl transpeptidase"

Dataset summary

The dataset consists of 81 variables and has 6120 observations (blood tests). See the summary below:

summary_df <- df %>% select(outcome, gender)
tbl_summary(
  summary_df,
  by = outcome,
  label = gender ~ "Gender") %>% 
  modify_header(label ~ "**Variable**") %>% 
  add_overall() %>%
  as_kable() %>%   kable_paper("hover")
Variable Overall, N = 6,120 Died, N = 2,905 Survived, N = 3,215
Gender
female 2,390 (39%) 751 (26%) 1,639 (51%)
male 3,730 (61%) 2,154 (74%) 1,576 (49%)

The blood tests were taken from 375 different patients.

df %>% select(PATIENT_ID, gender, outcome) %>% 
  drop_na(PATIENT_ID) %>% 
  select(-PATIENT_ID) %>%  
  tbl_summary(label =  gender ~ "Gender", by = outcome) %>%
  add_overall() %>% 
  modify_header(label ~ "**Variable**") %>%
  as_kable() %>%   kable_paper("hover")
Variable Overall, N = 375 Died, N = 174 Survived, N = 201
Gender
female 151 (40%) 48 (28%) 103 (51%)
male 224 (60%) 126 (72%) 98 (49%)

The dataset was split into two dataframes Patients and Blood tests in order to make data analysis easier.

Patients

One additional column was created to store the hospitalization time of all patients - used in further analysis to check the relation between length stay and outcome. Go to section Patients Visualization to see basic visualizations about patients.

patients <- df %>% 
              select(PATIENT_ID, age, gender, admission_time, discharge_time, outcome) %>% 
              drop_na(PATIENT_ID) %>%
              mutate("hospitalization_length" = round((difftime(discharge_time, admission_time, units = "days") ), digits = 2)) %>%
              relocate(hospitalization_length, .after = discharge_time)

head(patients) %>%
  kbl() %>%
  kable_paper("hover")
PATIENT_ID age gender admission_time discharge_time hospitalization_length outcome
1 73 male 2020-01-30 22:12:47 2020-02-17 12:40:09 17.60 days Survived
2 61 male 2020-02-04 21:39:03 2020-02-19 12:59:01 14.64 days Survived
3 70 female 2020-01-23 10:59:36 2020-02-08 17:52:31 16.29 days Survived
4 74 male 2020-01-31 23:03:59 2020-02-18 12:59:12 17.58 days Survived
5 29 female 2020-02-01 20:59:54 2020-02-18 10:33:06 16.56 days Survived
6 81 female 2020-01-24 10:47:10 2020-02-07 09:06:58 13.93 days Survived

Blood tests

blood_tests_df <- df %>%
  select(-c(admission_time, discharge_time)) %>%
  fill(PATIENT_ID)


markers_df <- blood_tests_df %>% select (-c(PATIENT_ID, age, RE_DATE, gender))

  tbl_summary(
    markers_df,
    by = outcome,
    missing = "no") %>% 
    modify_header(label = "**Marker**") %>%
    add_n() %>%
    bold_labels() %>%
    as_kable() %>%  
    kable_paper("hover") %>% 
    scroll_box(width = "100%", height = "200px")
Marker N Died, N = 2,905 Survived, N = 3,215
Hypersensitive cardiac troponinI 507 70 (18, 631) 3 (2, 7)
hemoglobin 975 123 (110, 135) 127 (116, 138)
Serum chloride 975 104 (100, 111) 101 (99, 103)
Prothrombin time 662 16.3 (15.0, 18.2) 13.6 (13.1, 14.1)
procalcitonin 459 0.38 (0.14, 1.13) 0.04 (0.02, 0.06)
eosinophils(%) 957 0.00 (0.00, 0.10) 0.70 (0.00, 1.80)
Interleukin 2 receptor 268 1,180 (807, 1,603) 529 (400, 742)
Alkaline phosphatase 930 83 (64, 123) 60 (50, 75)
albumin 934 28 (24, 31) 36 (34, 39)
basophil(%) 957 0.10 (0.10, 0.20) 0.20 (0.10, 0.40)
Interleukin 10 267 11 (6, 17) 5 (5, 8)
Total bilirubin 930 14 (10, 25) 8 (6, 12)
Platelet count 957 112 (55, 174) 229 (176, 290)
monocytes(%) 958 3.0 (2.0, 4.7) 8.2 (6.3, 10.0)
antithrombin 330 80 (70, 92) 93 (86, 103)
Interleukin 8 268 30 (18, 61) 11 (7, 19)
indirect bilirubin 906 6.2 (4.2, 9.2) 4.9 (3.4, 7.1)
Red blood cell distribution width 923 13.20 (12.40, 14.40) 12.20 (11.80, 12.80)
neutrophils(%) 957 92 (88, 95) 66 (56, 76)
total protein 931 62 (57, 68) 68 (65, 72)
Quantification of Treponema pallidum antibodies 279 0.06 (0.04, 0.07) 0.05 (0.04, 0.07)
Prothrombin activity 659 66 (56, 78) 94 (88, 103)
HBsAg 279 0.01 (0.00, 0.02) 0.00 (0.00, 0.01)
mean corpuscular volume 957 91.3 (87.1, 96.4) 89.8 (86.8, 91.9)
hematocrit 957 35.9 (32.5, 39.8) 37.1 (34.3, 39.9)
White blood cell count 1,127 12 (8, 17) 6 (4, 8)
Tumor necrosis factor alpha 268 11 (8, 17) 8 (6, 10)
mean corpuscular hemoglobin concentration 957 342 (331, 350) 343 (335, 350)
fibrinogen 566 3.92 (2.44, 5.63) 4.40 (3.56, 5.34)
Interleukin 1 beta 268 5.0 (5.0, 5.0) 5.0 (5.0, 5.0)
Urea 936 11 (7, 17) 4 (3, 5)
lymphocyte count 957 0.46 (0.31, 0.69) 1.25 (0.87, 1.62)
PH value 384 6.50 (6.00, 7.41) 6.50 (6.00, 7.00)
Red blood cell count 1,127 4.0 (3.6, 4.6) 4.2 (3.8, 4.7)
Eosinophil count 957 0.00 (0.00, 0.01) 0.03 (0.00, 0.09)
Corrected calcium 914 2.35 (2.27, 2.44) 2.37 (2.27, 2.44)
Serum potassium 980 4.60 (4.04, 5.27) 4.28 (3.92, 4.62)
glucose 775 9.1 (6.9, 13.3) 5.7 (5.0, 7.6)
neutrophils count 957 10.8 (7.0, 15.2) 3.5 (2.4, 5.2)
Direct bilirubin 930 8 (5, 14) 4 (2, 5)
Mean platelet volume 862 11.30 (10.70, 12.20) 10.40 (9.90, 11.00)
ferritin 283 1,636 (928, 2,517) 504 (235, 834)
RBC distribution width SD 923 43.7 (39.9, 48.5) 39.5 (37.6, 41.4)
Thrombin time 566 17.30 (15.80, 19.75) 16.40 (15.60, 17.30)
(%)lymphocyte 958 4 (2, 7) 24 (16, 33)
HCV antibody quantification 279 0.07 (0.04, 0.11) 0.06 (0.04, 0.08)
D-D dimer 630 19 (3, 21) 1 (0, 1)
Total cholesterol 931 3.32 (2.72, 3.88) 3.93 (3.39, 4.48)
aspartate aminotransferase 935 38 (25, 59) 21 (17, 29)
Uric acid 934 245 (166, 374) 240 (193, 304)
HCO3- 934 21.8 (18.8, 24.7) 24.7 (22.8, 26.7)
calcium 979 2.00 (1.90, 2.08) 2.17 (2.10, 2.25)
Amino-terminal brain natriuretic peptide precursor(NT-proBNP) 475 1,467 (516, 4,578) 64 (23, 166)
Lactate dehydrogenase 934 593 (431, 840) 220 (189, 278)
platelet large cell ratio 862 35 (30, 42) 28 (23, 33)
Interleukin 6 272 66 (30, 142) 8 (2, 21)
Fibrin degradation products 330 114 (18, 150) 4 (4, 4)
monocytes count 957 0.36 (0.20, 0.58) 0.43 (0.32, 0.58)
PLT distribution width 862 13.60 (12.10, 15.93) 11.70 (10.70, 13.00)
globulin 930 34.1 (30.2, 38.2) 31.8 (29.5, 35.2)
Gamma glutamyl transpeptidase 930 42 (27, 79) 29 (19, 46)
International standard ratio 659 1.31 (1.17, 1.48) 1.04 (0.99, 1.09)
basophil count(#) 957 0.010 (0.010, 0.030) 0.010 (0.010, 0.020)
2019-nCoV nucleic acid detection 501
-1 57 (100%) 444 (100%)
mean corpuscular hemoglobin 957 31.20 (29.90, 32.70) 30.70 (29.60, 31.90)
Activation of partial thromboplastin time 568 40 (36, 45) 39 (35, 43)
hs_CRP 737 114 (65, 191) 7 (2, 35)
HIV antibody quantification 278 0.08 (0.07, 0.11) 0.09 (0.08, 0.11)
serum sodium 975 142 (138, 148) 140 (138, 141)
thrombocytocrit 862 0.15 (0.10, 0.21) 0.24 (0.19, 0.30)
ESR 383 36 (16, 59) 26 (13, 40)
glutamic-pyruvic transaminase 931 26 (18, 44) 21 (15, 36)
eGFR 936 72 (43, 91) 100 (85, 114)
creatinine 936 88 (68, 130) 64 (54, 83)
ml_df <- blood_tests_df %>% 
  select(-RE_DATE) %>%
  group_by(PATIENT_ID) %>% 
  summarise(across(everything(), function(x) last(na.omit(x)))) %>% 
  na_mean(option = "median") %>% 
  select(-PATIENT_ID)

View(ml_df)

Visualization

Patients by gender

ggplot(patients, aes(x = gender, fill = gender)) +
  geom_bar() + 
  labs(y = "Number of patients", 
       x = "Gender")

Patients age by gender

patients_hist <- ggplot(patients, aes(x = age, fill = gender)) +
  geom_histogram(stat = "count",
                 binwidth = 1.2)+
  labs(y = "Number of patients", 
       x = "Age") +
  scale_x_continuous(breaks=seq(20, 100, 5))
 
ggplotly(patients_hist)      

Outcome grouped by gender, age

layout_ggplotly <- function(gg, x = -0.05, y = -0.05){
  # The 1 and 2 goes into the list that contains the options for the x and y axis labels respectively
  gg[['y']][['layout']][['annotations']][[1]][['y']] <- x
  gg[['y']][['layout']][['annotations']][[2]][['x']] <- y
  gg
}

patients_outcome <- ggplot(patients, aes(x = age, fill = outcome)) + 
                    geom_histogram(binwidth = 1.2) +
                    facet_grid(~ gender) +
                    scale_y_continuous(breaks=seq(0, 20, 2)) +
                    scale_x_continuous(breaks=seq(20, 100, 5)) +
                    labs(y = "Number of patients", x = "Age")

ggplotly(patients_outcome)

Outcome due to hospitalization length grouped by gender

hospitalization_length_plot <- ggplot(patients, aes(x = hospitalization_length, fill = outcome)) + 
      geom_histogram(binwidth = 1.2) +
      facet_grid(outcome ~ gender) +
      scale_y_continuous(breaks=seq(0, 20, 2)) +
      scale_x_continuous(breaks=seq(0, 40, 5)) +
      labs(y = "Number of patients", 
           x = "Hospitalization length [days]")

ggplotly(hospitalization_length_plot)

Died in specific days

outcome_per_day <- patients %>% 
                    mutate(discharge_time = as.Date(discharge_time)) %>%
                    filter(outcome == "Died")
                  
outcome_per_day_plot <- ggplot(outcome_per_day, aes(x = discharge_time, fill = outcome)) + 
  geom_histogram(binwidth = 1.2) + 
  facet_grid(~ gender) +  
  labs(x = "Discharge date", y = "Number of deaths") + 
  theme(legend.position = "none")

ggplotly(outcome_per_day_plot)

Dead cases during the day

outcome_during_day_plot <- patients %>%  
  mutate(time_h_m = hms(format(patients$discharge_time, format = "%H:%M:%S"))) %>% 
  mutate(time_h_m = (hour(time_h_m) + minute(time_h_m)/60)) %>%
  filter(outcome == "Died") %>%
  ggplot(aes(x = time_h_m, fill = "blue")) + 
  geom_histogram(binwidth = 1.2) + 
  scale_x_continuous(breaks = seq(0, 24, by = 1)) + 
  labs(x = "Number of dead cases", y = "Time of the day")+
  theme(legend.position = "none")

ggplotly(outcome_during_day_plot)

# Variables correlation Preparing the dataset for correlation (changing factor variables to numeric).

cor_df <- df %>%
            select(-c(PATIENT_ID, RE_DATE, admission_time, discharge_time)) %>%
            mutate(outcome = ifelse(df$outcome == "Died", 1, 0)) %>%
            mutate(gender = ifelse(df$gender == "male", 1, 0)) %>%
            rename(male = gender)

correlationMatrix <-  correlate(cor_df[sapply(cor_df, is.numeric)], use='pairwise.complete.obs')

Age correlation

From the previous analysis, it is known that elderly people are more susceptible to die due to Cocvid-19. Below short summary, what biomarkers are highly correlated with age.

age_correlation <- correlationMatrix %>% 
  focus(age) %>% 
  mutate(age = abs(age)) %>%
  arrange(desc(age)) %>% 
  filter(rowname != "outcome") %>% head 

age_correlation %>% kbl() %>% kable_paper("hover")
rowname age
eGFR 0.6295013
(%)lymphocyte 0.5468574
neutrophils(%) 0.5216471
albumin 0.5077106
Urea 0.4233471
hs_CRP 0.4007634

The most correlated is eGFR which is used to measure the the effectiveness of the work of the kidneys. Its hard to present a norm value, because this marker depends on many factors like gender, age, body mass, but some sources show that value above 90 is proper. Too low ,and too high value of GFR in some cases indicates kidney diseases which affect the blood filtration.

Below a chart presents the GFR value between patients in different age, grouped by outcome. It’s analysis shows, that many elderly patients that died, had some abnormalities in the work of the kidneys.

ggplot(ml_df, aes(x = age, y = `eGFR`, color = outcome)) + 
  geom_point() + 
  theme(legend.position = c(0.9,0.9)) + 
  ylim(0 , 150)

The next two high correlated biomarkers are related to immune system. The values of lymphocyte and neutrophils show how strong the organism is and how well it fights with the disease.

Lymphocytes are cells responsible for protecting our body (by creating anitbodies) from viruses, bacteria and other disease causing factors. The norm value for an adult is between 15 - 40%. Lower lymphocytes levels means, that the body cannot fight the disease. The left chart below confirms, that elderly people have weaker immune system and it’s hard for their organism to fight the disease.

plot1 <- ggplot(ml_df, aes(x = age, y = `(%)lymphocyte`, color = outcome)) + geom_point() + theme(legend.position = "none")
plot2 <- ggplot(ml_df, aes(x = `lymphocyte count`, y = `(%)lymphocyte`, color = outcome)) + 
  geom_point() + 
  theme(legend.position = c(0.8, 0.2)) +
  xlim(0,3.75)
grid.arrange(plot1, plot2, ncol=2)

Neutrophils are essential part of immune system - this cells search for pathogens in organisms and destroy them. High value of neutrophils(%) results in many neutrophil cells in blood (right plot below), which means that a medical condition occurs in patients body and that the immune system fights it.

This correlation explains that elderly people are more vulnerable, and their immune systems need to produce more neutrophils to fight the pathogens than in younger patients. The left plot shows that some of the tested patients had some medical condition, due to increased amount of neutrophils. Adding the information about the outcome, confirms that elderly patients are more likely to die because of Covid-19.

plot1 <- ggplot(ml_df, aes(x = age, y = `neutrophils(%)`, color = outcome)) + geom_point() + theme(legend.position = "none")
plot2 <- ggplot(ml_df, aes(x = `neutrophils count`, y = `neutrophils(%)`, color = outcome)) + geom_point() + theme(legend.position = c(0.8, 0.2))
grid.arrange(plot1, plot2, ncol=2)

Outcome correlation

The following section is devoted to check the correlation between biomarkers and the outcome.

The correlation matrix for the highest correlated variables and the numeric correlation values are shown below.

'%ni%' <- Negate('%in%')

outcome_cor <- correlationMatrix %>% 
  focus(outcome) %>% 
  mutate(outcome = abs(outcome)) %>%
  arrange(desc(outcome)) %>% 
  filter(`rowname` %ni% c('neutrophils(%)', 'neutrophils count')) %>% 
  mutate(outcome = round(outcome,2)) %>%
  head

outcome_corr_df <- cor_df %>% select(c(outcome_cor$rowname, outcome)) 

outcome_cor_matrix <- cor(outcome_corr_df[sapply(outcome_corr_df, is.numeric)], use='pairwise.complete.obs')

corrplot(outcome_cor_matrix)

The previous sections contains the analysis about lymphocytes and how important they are when fighting with the disease, that’s why they won’t be considered in this section.

outcome_cor %>% kbl() %>% kable_paper("hover") 
rowname outcome
(%)lymphocyte 0.72
albumin 0.69
Prothrombin activity 0.66
hs_CRP 0.65
D-D dimer 0.64
Lactate dehydrogenase 0.62

Below in each tab are presented the values of each biomarkers for all the patients grouped by age and outcome. Analysis of theses data shows, that all the biomarkers are also somehow correlated with the age, because the biomarkers values for eldery are very often (in this 5 biomarkers) outstanding from the values for people less than 50 years. This statement is confirmed by the boxplots below every chart, preseting the distribution of the biomarkers grouped by age group (adult - less than 64 years, eldery - more than 64 years), gender and outcome.

layout_ggplotly <- function(gg, x = -0.02, y = -0.05){
  # The 1 and 2 goes into the list that contains the options for the x and y axis labels respectively
  gg[['x']][['layout']][['annotations']][[1]][['y']] <- x
  gg[['x']][['layout']][['annotations']][[2]][['x']] <- y
  gg
}

Albumin

ggplot(ml_df, aes(x = age, y = `albumin`, color = outcome)) + geom_point()

albumin_plot <- ml_df %>% 
  mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>% 
  ggplot(aes(x= age_group, y = `albumin`, fill = gender)) +
  geom_boxplot(na.rm=TRUE) +  facet_grid(~outcome) + 
  labs(x = "Age group", y = "Albumin")

ggplotly(albumin_plot) %>% layout(boxmode = "group") %>% layout_ggplotly

Prothrombin activity

ggplot(ml_df, aes(x = age, y = `Prothrombin activity`, color = outcome)) + geom_point()

pt_plot <- ml_df %>% 
  mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>% 
  ggplot(aes(x= age_group, y = `Prothrombin activity`, fill = gender)) +
  geom_boxplot(na.rm=TRUE) +  facet_grid(~outcome) + 
  labs(x = "Age group", y = "Prothrombin activity")

ggplotly(pt_plot) %>% layout(boxmode = "group") %>% layout_ggplotly

Hs-CRP

A norm value for hs-CRP is about 50. All values above that level indicate some kind of inflammation in the body. Many values on the first plot are much more above the norm level showing very strong inflammation which eventually (probably) contributed to the death.

ggplot(ml_df, aes(x = age, y = hs_CRP, color = outcome)) + geom_point()

crp_plot <- ml_df %>%
  mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>%
  ggplot(aes(x= age_group, y = hs_CRP, fill = gender)) +
  geom_boxplot(na.rm=TRUE) +  facet_grid(~outcome) +
  labs(x = "Age group", y = "High sensitivity C-reactive protein")

ggplotly(crp_plot) %>% layout(boxmode = "group") %>% layout_ggplotly

D-D dimer

D-diemrs are cells responsible for decomposition of a clot. Their high value mean that there was a blood clot in the organism. Sometimes it can be linked with myocardial infarction, pulmonary embolism which combined with Covid-19 symptoms can lead to death.

ggplot(ml_df, aes(x = age, y = `D-D dimer`, color = outcome)) + geom_point()

dimer_plot <- ml_df %>% 
  mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>% 
  ggplot(aes(x= age_group, y = `D-D dimer`, fill = gender)) +
  geom_boxplot(na.rm=TRUE) +  facet_grid(~outcome) + 
  labs(x = "Age group", y = "D-D dimer")

ggplotly(dimer_plot) %>% layout(boxmode = "group") %>% layout_ggplotly

Lactate dehydrogenase

ggplot(ml_df, aes(x = age, y = `Lactate dehydrogenase`, color = outcome)) + geom_point()

ldh_plot <- ml_df %>% 
  mutate(age_group = as.factor(ifelse(ml_df$age < 64, 'adult', 'elderly'))) %>% 
  ggplot(aes(x= age_group, y = `Lactate dehydrogenase`, fill = gender)) +
  geom_boxplot(na.rm=TRUE) +  facet_grid(~outcome) + 
  labs(x = "Age group", y = "Lactate dehydrogenase")

ggplotly(ldh_plot) %>% layout(boxmode = "group") %>% layout_ggplotly

Animation

patients_agg <- patients %>% select(c(discharge_time, outcome)) %>%
  mutate(discharge_time = as.Date(patients$discharge_time, "%m/%d/%Y" )) %>%
  filter(outcome == 'Died') %>%
  group_by(discharge_time) %>%
  summarise(deaths_count = n(), .groups="drop") %>%
  arrange(discharge_time) %>%
  mutate(deaths_count_agg = cumsum(deaths_count))

ggplot(patients_agg, aes(x = discharge_time, y = deaths_count_agg)) + 
  geom_line(size  = 1.1, color = 'red') + 
  transition_reveal(discharge_time) + 
  labs(x = "Discharde time", y = "Deaths count aggregate") + 
  scale_x_continuous(breaks = seq(min(patients_agg$discharge_time), max(patients_agg$discharge_time), 10))

Classification model

In this chapter 3 classification models are trained to predict the outcome (death/survival) of COVID-19 sick patients based on basic patients observations and blood test samples. One blood test for each patient is considered as an observation for the machine learning algorithms. Extra data pre processing was needed to prepare the dataset. For each patient, all the blood test are reduced to one row, containing the closest value to the discharge time. Missing values for some variables were replaced with median, due to very unsymmetrical distribution.

Redundant columns like patient id, blood test time and admission and discharge time were removed from the dataset.

The preprocessed data is split into two datasets training and testing.

set.seed(23)
rows <- sample(nrow(ml_df))
ml_df <- ml_df[rows,]


set.seed(23)
inTraining <- createDataPartition(y = ml_df$outcome, p=.70, list=FALSE)
training <- ml_df[inTraining,]
testing <- ml_df[-inTraining,]

The first dead patient in the original dataset was on 220 place, so the below check is done to be sure that training and testing sets for the model have similar output class distribution.

training %>% select(gender, outcome) %>% tbl_summary(by = outcome)
Characteristic Died, N = 1221 Survived, N = 1411
gender
female 39 (32%) 73 (52%)
male 83 (68%) 68 (48%)

1 Statistics presented: n (%)

testing %>% select(gender, outcome) %>% tbl_summary(by = outcome)
Characteristic Died, N = 521 Survived, N = 601
gender
female 9 (17%) 30 (50%)
male 43 (83%) 30 (50%)

1 Statistics presented: n (%)

For the learning process Repeated Cross-Validation was used. The training process will be repeated 5 times, and each time the training set will be split to training and validation sets.

ctrl <- trainControl(method="repeatedcv", 
                     number = 10, 
                     repeats = 3)

SVM

For reproducibility.

set.seed(23)

Standarization of values for SVM Polynominal Classifier.

set.seed(23)
preProcValues <- preProcess(training, method = c("center", "scale"))
trainTransformed <- predict(preProcValues, training)
testTransformed <- predict(preProcValues, testing)

dataset <- rbind(trainTransformed, testTransformed)

Creating the classifier with training dataset.

set.seed(23)
svm_fit <- train(outcome ~ ., 
                data = trainTransformed, 
                method = "svmPoly", 
                trControl = ctrl,
                tuneGrid = data.frame(degree = 1,scale = 1, C = 1))
svm_classes <- predict(svm_fit, newdata = testTransformed)

svm_classes_prob <- predict(svm_fit, newdata = testTransformed, type = "prob")

confusionMatrix(data = svm_classes, testTransformed$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Died Survived
##   Died       47        7
##   Survived    5       53
##                                           
##                Accuracy : 0.8929          
##                  95% CI : (0.8203, 0.9434)
##     No Information Rate : 0.5357          
##     P-Value [Acc > NIR] : 4.008e-16       
##                                           
##                   Kappa : 0.7852          
##                                           
##  Mcnemar's Test P-Value : 0.7728          
##                                           
##             Sensitivity : 0.9038          
##             Specificity : 0.8833          
##          Pos Pred Value : 0.8704          
##          Neg Pred Value : 0.9138          
##              Prevalence : 0.4643          
##          Detection Rate : 0.4196          
##    Detection Prevalence : 0.4821          
##       Balanced Accuracy : 0.8936          
##                                           
##        'Positive' Class : Died            
## 
learning_curve_dat <- function(data,
                                  test_prob = 0.3,
                                  proportion = (1:10)/10,
                                  train_method,
                                  ctrl = NULL,
                                  # train parameters
                                  ...)
  
{  
  proportion <- sort(unique(proportion))
  dataframe_size <- length(proportion) * 2
  n_size <- length(proportion)
  
  set.seed(23)
  inTraining <- createDataPartition(data$outcome, p = 1 - test_prob, list = FALSE)
  training <- data[inTraining,]
  testing <- data[-inTraining,]
  
  training_data_size <- nrow(training)

  
  learning_error_df <- data.frame(training_size = integer(dataframe_size),
                               data = character(dataframe_size),
                               error_value = numeric(dataframe_size))
  
  index <- 1
  
  for(i in seq(along = proportion)) {

    training_set <- training[1:floor((training_data_size * proportion[i])),]
    training_set_size <- nrow(training_set)
    set.seed(23)
    model <- train(outcome ~ .,
                   data = training_set,
                   method = train_method,
                   trControl = ctrl,
                   ...)

    # Training Data Error Value
    learning_error_df$training_size[index] <- training_set_size
    learning_error_df$data[index] <- "Training"
    learning_error_df$error_value[index] <- rmse(training_set$outcome, predict(model, newdata = training_set))
    
    # Testing Data Error Value
    learning_error_df$training_size[index + 1] <- training_set_size
    learning_error_df$data[index + 1] <- "Testing"
    learning_error_df$error_value[index + 1] <- rmse(testing$outcome, predict(model, newdata = testing))

    index <- index + 2
    }
  learning_error_df
}
# svm_learninig_curve <- learning_curve_dat(dataset, 
#                                           test_prob = 0.3, 
#                                           train_method = "svmPoly",
#                                           ctrl = ctrl,
#                                           tuneGrid = data.frame(degree = 1,scale = 1, C = 1))
# 
# ggplot(svm_learninig_curve, aes(x = training_size)) +
#   geom_line(aes(y = error_value, color = data), size=2.0) + 
#   theme_bw() +
#   labs(title = "Learning Curves",
#        x = "Training set size",
#        y = "RMSE value")
# svm_ROC <- roc(response = testing$outcome, 
#               predictor = svm_classes_prob[, "Died"],
#               levels = rev(levels(testing$outcome)),
#               plot = TRUE,
#               auc = TRUE,
#               print.auc = TRUE)

Naive Bayes

set.seed(23)
nb_fit <- train(outcome ~ ., 
                data = training, 
                method = "naive_bayes", 
                trControl = ctrl)
nb_classes <- predict(nb_fit, newdata = testing)

nb_classes_prob <- predict(nb_fit, newdata = testing, type = "prob")

confusionMatrix(data = nb_classes, testing$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Died Survived
##   Died       48        3
##   Survived    4       57
##                                           
##                Accuracy : 0.9375          
##                  95% CI : (0.8755, 0.9745)
##     No Information Rate : 0.5357          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8742          
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9231          
##             Specificity : 0.9500          
##          Pos Pred Value : 0.9412          
##          Neg Pred Value : 0.9344          
##              Prevalence : 0.4643          
##          Detection Rate : 0.4286          
##    Detection Prevalence : 0.4554          
##       Balanced Accuracy : 0.9365          
##                                           
##        'Positive' Class : Died            
## 
nb_ROC <- roc(response = testing$outcome, 
              predictor = nb_classes_prob[, "Died"],
              levels = rev(levels(testing$outcome)),
              plot = TRUE,
              auc = TRUE,
              print.auc = TRUE)

nb_ROC
## 
## Call:
## roc.default(response = testing$outcome, predictor = nb_classes_prob[,     "Died"], levels = rev(levels(testing$outcome)), auc = TRUE,     plot = TRUE, print.auc = TRUE)
## 
## Data: nb_classes_prob[, "Died"] in 60 controls (testing$outcome Survived) < 52 cases (testing$outcome Died).
## Area under the curve: 0.9744

Random Forest

set.seed(17)
fit <- train(outcome ~ ., data = training, method = "rf", trControl = ctrl, ntree=10)
rfClasses <- predict(fit, newdata = testing)
confusionMatrix(data =rfClasses, testing$outcome)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Died Survived
##   Died       50        7
##   Survived    2       53
##                                           
##                Accuracy : 0.9196          
##                  95% CI : (0.8529, 0.9626)
##     No Information Rate : 0.5357          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.8395          
##                                           
##  Mcnemar's Test P-Value : 0.1824          
##                                           
##             Sensitivity : 0.9615          
##             Specificity : 0.8833          
##          Pos Pred Value : 0.8772          
##          Neg Pred Value : 0.9636          
##              Prevalence : 0.4643          
##          Detection Rate : 0.4464          
##    Detection Prevalence : 0.5089          
##       Balanced Accuracy : 0.9224          
##                                           
##        'Positive' Class : Died            
##